CD7: full spectrum dimensionality reduction of neutrophils¶

In this notebook, we perform a clustering analysis of the CD7 data on features extracted with SCIP.

In [1]:
%load_ext autoreload
%autoreload 2
# Reload edited project modules automatically so scip_workflows changes are picked up live.
In [2]:
from scip_workflows.common import *
In [22]:
import warnings
import anndata
import flowutils
import scanpy
import scipy.stats
from kneed import KneeLocator
from scipy.cluster import hierarchy
from scipy.cluster.hierarchy import fcluster
from scipy.spatial.distance import squareform
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
from sklearn.preprocessing import scale

scanpy.settings.verbosity = 3
In [4]:
from scip.features import intensity

# Intensity feature names from SCIP; drop the higher-order moments,
# which are not used in this analysis.
props = intensity.props.copy()
props.remove("kurtosis")
props.remove("skewness")
In [5]:
def asinh_scale(x, t):
    """Asinh-transform intensity values and mean-center them.

    Applies flowutils' asinh transform with top-of-scale ``t`` (m=4.5, a=1),
    then centers per feature; ``with_std=False`` keeps the transformed
    scale intact (no unit-variance scaling).
    """
    transformed = flowutils.transforms.asinh(x, channel_indices=None, t=t, m=4.5, a=1)
    return scale(transformed, with_std=False)
In [6]:
plt.rcParams["figure.dpi"] = 150

Data¶

In [59]:
# Resolve inputs/outputs from snakemake when executed as a workflow step;
# fall back to hard-coded local paths for interactive runs (snakemake is
# undefined there, which raises NameError).
try:
    features = snakemake.input.features
    index = snakemake.input.index
    columns = snakemake.input.columns
    fillna = bool(int(snakemake.wildcards.fillna))
    output_adata = snakemake.output.adata
    output_intensity_distribution = snakemake.output.intensity_distribution
except NameError:
    # data_dir = Path("/data/gent/vo/000/gvo00070/vsc42015/datasets/cd7/800/results/scip/202203221745/")
    data_dir = Path("/home/maximl/scratch/data/vsc/datasets/cd7/800/scip/061020221736/")
    features = data_dir / "features.parquet"
    index = data_dir / "indices" / "index.npy"
    columns = data_dir / "indices" / "columns.npy"
    fillna = False
    output_adata = data_dir / f"CD7_adata_{int(fillna)}.pickle"
    # plain string: no placeholders, so no f-prefix needed
    output_intensity_distribution = data_dir / "figures" / "CD7_intensity_distribution.pickle"
In [65]:
# Load the SCIP feature table and restrict it to panel "D", then subset
# rows and columns to the precomputed QC index/column selections.
df = pq.read_table(features).to_pandas()
df = df.set_index(["meta_panel", "meta_replicate", "meta_P", "meta_id"])
df = df.loc["D"]
df = df[[c for c in numpy.load(columns, allow_pickle=True) if c in df.columns]]
df = df.loc[numpy.load(index, allow_pickle=True)]
df = df.sort_index()

df.shape
Out[65]:
(29959, 1188)

Removing zero variance features¶

We drop features with zero variance using scikit-learn's `VarianceThreshold` (already imported above).

In [66]:
var = VarianceThreshold().fit(df.filter(regex="feat"))
In [67]:
# Keep only non-constant feature columns; re-attach the meta columns unchanged.
df = pandas.concat(
    [df.filter(regex="feat").iloc[:, var.get_support()], df.filter(regex="meta")],
    axis=1,
)

NaN values¶

In [68]:
df.isna().all(axis=0).any()
Out[68]:
False
In [69]:
df.filter(regex="feat").isna().all(axis=1).sum()
Out[69]:
0
In [70]:
seaborn.histplot(data=df.isna().sum())
Out[70]:
<AxesSubplot:ylabel='Count'>
In [71]:
# NaN strategy is controlled by the `fillna` wildcard: either impute with 0
# or drop every column that contains any missing value.
if fillna:
    df = df.fillna(0)
else:
    df = df.drop(columns=df.columns[df.isna().sum() > 0])

Analysis¶

In [72]:
# Build the AnnData object: meta_* columns become obs, feat_* columns become X.
obs = df.filter(regex="meta").reset_index()
obs.index = df.index
adata = anndata.AnnData(df.filter(regex="feat").astype(numpy.float32), obs=obs)
# Keep an untouched copy of the raw values before any scaling/transforming.
adata.raw = adata.copy()

adata.obs["meta_replicate"] = adata.obs["meta_replicate"].astype("category")
In [73]:
markers = [col for col in adata.var.index if col.startswith("feat_sum")]
In [74]:
adata_pre = adata.copy()
In [75]:
%%time
# Baseline embedding on all (scaled) features to gauge replicate/batch
# structure before any feature selection or batch-aware transformation.
scanpy.pp.scale(adata_pre)
scanpy.tl.pca(adata_pre, svd_solver='arpack')
scanpy.pp.neighbors(adata_pre, n_neighbors=30)
scanpy.tl.umap(adata_pre)
scanpy.pl.umap(adata_pre, color=["meta_replicate"])
computing PCA
    with n_comps=50
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:08)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:44)
CPU times: user 1min 56s, sys: 15.4 s, total: 2min 11s
Wall time: 54.6 s
In [76]:
# Features treated as discrete for mutual information estimation:
# any feature whose name contains one of these substrings.
discrete = ["median", "area", "euler"]
discrete_cols = [c for c in adata.var_names if any(d in c for d in discrete)]
# Positional indices of the same columns (needed by mutual_info_classif).
discrete_cols_i = [
    i for i, c in enumerate(adata.var_names) if any(d in c for d in discrete)
]
In [77]:
# Flag the four total-intensity marker features (used for plotting below).
adata.var["is_marker"] = [
    any(n.endswith("feat_combined_sum_%s" % m) for m in ["DAPI", "EGFP", "RPe", "APC"])
    for n in adata.var_names
]
# Flag intensity features of fluorescence channels: these get the asinh
# transform instead of plain standard scaling.
adata.var["do_asinh"] = [
    (any(m in n for m in ["DAPI", "EGFP", "RPe", "APC"]) and any(o in n for o in props))
    for n in adata.var_names
]
In [78]:
%%time
# Per-acquisition normalization: each (replicate, well-position) group is
# transformed independently to mitigate batch effects.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    sc_df = scanpy.get.obs_df(adata, keys=adata.var_names.to_list())
    # Fluorescence intensity features: asinh with per-group max as top of scale, then centering.
    sc_df[adata.var_names[adata.var.do_asinh].to_list()] = sc_df[adata.var_names[adata.var.do_asinh]].groupby(["meta_replicate", "meta_P"]).transform(lambda x: asinh_scale(x, x.max()))
    # All other features: plain per-group standard scaling.
    sc_df[adata.var_names[~adata.var.do_asinh].to_list()] = sc_df[adata.var_names[~adata.var.do_asinh]].groupby(["meta_replicate", "meta_P"]).transform(lambda x: scale(x))

# Rebuild the AnnData with the transformed matrix; obs/var/raw are carried over.
adata = anndata.AnnData(X=sc_df, obs=adata.obs, var=adata.var, raw=adata.raw)
CPU times: user 1min 16s, sys: 1.01 s, total: 1min 17s
Wall time: 1min 17s
In [79]:
def map_names(a):
    """Translate a raw total-intensity feature name to its marker label.

    Raises KeyError for any name outside the four known channels.
    """
    marker_labels = {
        "feat_combined_sum_DAPI": "DAPI",
        "feat_combined_sum_EGFP": "CD45",
        "feat_combined_sum_RPe": "Siglec 8",
        "feat_combined_sum_APC": "CD15",
    }
    return marker_labels[a]
In [80]:
# Strip plots of the four marker intensities per replicate and well image
# position, to visually assess residual position/batch effects.
aligned_df = scanpy.get.obs_df(
    adata, keys=adata.var_names[adata.var.is_marker].to_list()
).reset_index()

# Long format: one row per (cell, marker) value.
melted_df = pandas.melt(
    aligned_df,
    id_vars=["meta_P", "meta_replicate"],
    value_vars=adata.var_names[adata.var.is_marker].to_list(),
)
melted_df.variable = melted_df.variable.apply(map_names)

grid = seaborn.FacetGrid(
    data=melted_df,
    col="meta_replicate",
    row="variable",
    sharey="row",
    aspect=1.5,
    margin_titles=True,
)
grid.map_dataframe(seaborn.stripplot, x="meta_P", y="value", size=1, alpha=0.5)

grid.set_axis_labels("Well image position", "Fluorescence intensity")
grid.set_titles(col_template="Replicate {col_name}", row_template="{row_name}")

grid.add_legend()

# Save the figure to the snakemake-declared output path.
plt.savefig(output_intensity_distribution, bbox_inches='tight', pad_inches=0)
Out[80]:
<seaborn.axisgrid.FacetGrid at 0x7fab8c380fd0>
In [81]:
def rediscritize(v):
    """Map each value of ``v`` onto the mean of its histogram bin.

    Bin edges come from numpy's automatic ``histogram_bin_edges``; the
    result has the same length as ``v`` (dtype float). Empty bins yield
    NaN means, which only matters if a bin index is actually used.
    """
    edges = numpy.histogram_bin_edges(v)
    bin_idx = numpy.digitize(v, bins=edges)
    n_bins = numpy.max(bin_idx)
    # Mean of the values that fell into each bin (1-based bin labels).
    bin_means = numpy.array([v[bin_idx == b].mean() for b in range(1, n_bins + 1)])
    return bin_means[bin_idx - 1].astype(float)
In [82]:
adata = adata[adata.to_df().isna().sum(axis=1) == 0].copy()
In [83]:
# Re-discretize the discrete features column-wise so mutual_info_classif
# can treat them as discrete despite the earlier continuous scaling.
X = adata.X.copy()
X[:, discrete_cols_i] = numpy.apply_along_axis(rediscritize, 0, X[:, discrete_cols_i])
/srv/scratch/maximl/mambaforge/envs/scip/lib/python3.9/site-packages/numpy/core/fromnumeric.py:3474: RuntimeWarning: Mean of empty slice.
  return _methods._mean(a, axis=axis, dtype=dtype,
/srv/scratch/maximl/mambaforge/envs/scip/lib/python3.9/site-packages/numpy/core/_methods.py:189: RuntimeWarning: invalid value encountered in true_divide
  ret = ret.dtype.type(ret / rcount)
In [84]:
%%time
# Mutual information of every feature with the replicate label — a proxy
# for how strongly a feature encodes batch rather than biology.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    
    mi_post = mutual_info_classif(X=X, y=adata.obs["meta_replicate"], discrete_features=discrete_cols_i, n_neighbors=30, random_state=0)
    # Sorted descending: most batch-associated features first.
    mi_post = pandas.Series(mi_post, index=adata.var_names).sort_values(ascending=False)
CPU times: user 3min 44s, sys: 1.28 s, total: 3min 45s
Wall time: 3min 45s
In [85]:
# Find the elbow of the sorted MI curve to derive a data-driven threshold.
kneedle = KneeLocator(
    numpy.arange(len(mi_post)),
    mi_post,
    S=40,
    curve="convex",
    direction="decreasing",
    online=False,
)
# NOTE(review): kneedle.knee is None when no knee is found, which would make
# int() raise — assumes a knee is always detected for this data.
elbow_value = mi_post.iloc[int(kneedle.knee)]

kneedle.plot_knee()
In [86]:
# Keep features *below* the elbow, i.e. those with low mutual information
# w.r.t. the replicate label (least batch-associated).
selected_mi = mi_post[mi_post < elbow_value].index.values
# Fraction of features retained.
len(selected_mi) / len(mi_post)
Out[86]:
0.8460721868365181
In [87]:
len(mi_post[mi_post > elbow_value].index.values)
Out[87]:
144
In [88]:
adata2 = adata[:, selected_mi].copy()

Feature clustering¶

In [89]:
# Pairwise Spearman correlation between the selected features, visualized
# with hierarchical clustering.
corr = scipy.stats.spearmanr(adata2.X).correlation
seaborn.clustermap(corr, vmin=-1, vmax=1, center=0)
/srv/scratch/maximl/mambaforge/envs/scip/lib/python3.9/site-packages/seaborn/matrix.py:657: UserWarning: Clustering large matrix with scipy. Installing `fastcluster` may give better performance.
  warnings.warn(msg)
Out[89]:
<seaborn.matrix.ClusterGrid at 0x7fab5c1273d0>
In [90]:
# Enforce exact symmetry (spearmanr can return tiny asymmetries) and a unit diagonal
# so squareform() accepts the derived distance matrix.
corr = (corr + corr.T) / 2
numpy.fill_diagonal(corr, 1)


def feat_from_corr(corr):
    """Cluster correlated features and keep one representative per cluster.

    Reads the global ``adata2`` to pick, within each cluster, the feature
    with the highest standard deviation. Returns (features, indices, clusters).
    """
    # We convert the correlation matrix to a distance matrix before performing
    # hierarchical clustering using Ward's linkage.
    distance_matrix = 1 - numpy.abs(corr)
    dist_linkage = hierarchy.ward(squareform(distance_matrix))

    # Cut the dendrogram at distance 0.1 to form feature clusters.
    clusters = fcluster(dist_linkage, 0.1, criterion="distance").ravel()

    indices = []
    for c in numpy.unique(clusters):
        ci = numpy.flatnonzero(clusters == c)
        # Representative = most variable feature of the cluster.
        indices.append(ci[adata2.X[:, ci].std(axis=0).argmax()])

    features = [adata2.var_names[i] for i in indices]
    return features, indices, clusters


features, indices, clusters = feat_from_corr(corr)
In [91]:
seaborn.clustermap(corr[indices, :][:, indices], vmin=-1, vmax=1, center=0)
Out[91]:
<seaborn.matrix.ClusterGrid at 0x7fab889c4550>
In [92]:
# Record the correlation-based selection and cluster membership in var.
adata2.var["selected_corr"] = False
adata2.var.loc[features, "selected_corr"] = True
adata2.var["feature_clusters"] = clusters
In [93]:
scanpy.pp.scale(adata2)
In [94]:
adata3 = adata2[:, adata2.var.selected_corr].copy()
In [95]:
# PCA + kNN graph on the final feature set (seeded for reproducibility).
scanpy.tl.pca(adata3, svd_solver="arpack", random_state=0)
scanpy.pp.neighbors(adata3, n_neighbors=10, method="umap", random_state=0)
computing PCA
    with n_comps=50
    finished (0:00:00)
computing neighbors
    using 'X_pca' with n_pcs = 50
    finished: added to `.uns['neighbors']`
    `.obsp['distances']`, distances for each pair of neighbors
    `.obsp['connectivities']`, weighted adjacency matrix (0:00:03)
In [96]:
%%time
# Leiden clustering at two resolutions; each result is stored under its own obs key.
resolutions = [0.5, 0.75]
for res in resolutions:
    scanpy.tl.leiden(adata3, resolution=res, key_added=f"leiden_{res}", random_state=0)
running Leiden clustering
    finished: found 9 clusters and added
    'leiden_0.5', the cluster labels (adata.obs, categorical) (0:00:06)
running Leiden clustering
    finished: found 10 clusters and added
    'leiden_0.75', the cluster labels (adata.obs, categorical) (0:00:07)
CPU times: user 13.4 s, sys: 314 ms, total: 13.7 s
Wall time: 13.6 s
In [97]:
scanpy.tl.umap(adata3, random_state=0)
computing UMAP
    finished: added
    'X_umap', UMAP coordinates (adata.obsm) (0:00:29)
In [98]:
# Inspect cluster assignments and replicate mixing on the embedding.
scanpy.pl.umap(
    adata3,
    color=["leiden_0.5", "leiden_0.75", "meta_replicate"],
    legend_loc="on data",
)
In [99]:
adata3.obs["leiden"] = adata3.obs["leiden_0.75"]
In [100]:
seaborn.countplot(data=adata3.obs, x="leiden", hue="meta_replicate")
Out[100]:
<AxesSubplot:xlabel='leiden', ylabel='count'>
In [101]:
# Final object: full MI-selected feature matrix (adata2) combined with the
# clustering results and embeddings computed on the corr-selected subset (adata3).
adata_out = anndata.AnnData(
    X=adata2.X, var=adata2.var, obs=adata3.obs, obsm=adata3.obsm, raw=adata2.raw
)
In [102]:
import pickle

# Persist the final AnnData object to the configured output path.
# Bug fix: the original wrote to a bare name `output`, which is never
# defined anywhere in this notebook and raises NameError on a fresh run;
# the configuration cell defines `output_adata` for exactly this purpose.
with open(output_adata, "wb") as fh:
    pickle.dump(adata_out, fh)
In [ ]: